In [1]:
import pandas as pd
import itertools
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

folder_path = '../assets/Assignment_Data_2023-2024/Assignment_Data_2023-2024/'

club_games_df = pd.read_csv(folder_path+'club_games.csv')
clubs_df = pd.read_csv(folder_path+'clubs.csv')
competition_df = pd.read_csv(folder_path+'competitions.csv')
game_lineup_df = pd.read_csv(folder_path+'game_lineups.csv')
players_df = pd.read_csv(folder_path+'players.csv')
appearances_df = pd.read_csv(folder_path+'appearances.csv')
In [2]:
'''
in dataset to abbreviate millions there is m near the number, we have to clean that in order to make the conversion
to a number 
also excluding negative values 
'''

# Clean the 'net_transfer_record' column and convert to numeric
clubs_df['net_transfer_record'] = clubs_df['net_transfer_record'].replace(r'[\$,€m]', '', regex=True)
clubs_df['net_transfer_record'] = pd.to_numeric(clubs_df['net_transfer_record'], errors='coerce')

# Merge the dataframes on competition ID
merged_df = pd.merge(clubs_df, competition_df[['competition_id', 'competition_code']], left_on='domestic_competition_id', right_on='competition_id')

# Group by competition code, club name, and last season, and sum the net transfer records
club_transfer_analysis = merged_df.groupby(['competition_code', 'name', 'last_season'])['net_transfer_record'].sum().reset_index()

def process_and_plot(group):
    pivot_table = group.pivot(index='last_season', columns='name', values='net_transfer_record')
    
    # Select last 3 seasons, handle cases with fewer than 3 seasons
    if len(pivot_table.index) > 2:
        last_3_years = pivot_table.index[-3:]
    else:
        last_3_years = pivot_table.index

    pivot_table = pivot_table.loc[last_3_years]

    # Replace negative values with NaN
    pivot_table = pivot_table.where(pivot_table >= 0)

    # Prepare data for Plotly
    data = []
    for column in pivot_table.columns:
        trace = go.Scatter(
            x=pivot_table.index,
            y=pivot_table[column],
            mode='lines+markers',
            name=column
        )
        data.append(trace)

    # Layout settings for the plot
    layout = go.Layout(
        title=f'Net Transfer Spending Over Seasons - {group.name}',
        xaxis=dict(title='Season'),
        yaxis=dict(title='Net Transfer Spending (in millions)'),
        legend=dict(title='Club', x=1.05, y=1, orientation='v'),
        margin=dict(l=50, r=50, t=80, b=50),
        hovermode='closest',
        showlegend=True
    )

    # Create and display the plot using Plotly
    fig = go.Figure(data=data, layout=layout)
    fig.show()

# Apply the function to each group in the groupby object
club_transfer_analysis.groupby('competition_code').apply(process_and_plot)
Out[2]:

Top 30 stadiums with the most seats¶

In [3]:
max_seats_per_team = clubs_df.groupby('name')['stadium_seats'].max()
max_seats_per_team = max_seats_per_team.sort_values(ascending=False)

# Select top 30 teams
top_stadiums = max_seats_per_team.head(30)

# Create Plotly bar chart
data = [
    go.Bar(
        x=top_stadiums.index,
        y=top_stadiums.values,
        marker=dict(color='skyblue')
    )
]

layout = go.Layout(
    title='Top 30 Stadium Capacity',
    xaxis=dict(title='Club'),
    yaxis=dict(title='Number of Seats'),
    margin=dict(b=150),  # Adjust bottom margin to accommodate x-axis labels
    hovermode='closest',
)

fig = go.Figure(data=data, layout=layout)
fig.update_xaxes(tickangle=45, tickmode='array', tickvals=top_stadiums.index, ticktext=top_stadiums.index)

fig.show()
In [4]:
min_seats_per_team = clubs_df.groupby('name')['stadium_seats'].min()
min_seats_per_team = min_seats_per_team.sort_values(ascending=True)

# Select top 30 teams
top_stadiums = min_seats_per_team.head(30)

# Create Plotly bar chart
data = [
    go.Bar(
        x=top_stadiums.index,
        y=top_stadiums.values,
        marker=dict(color='lightgreen')
    )
]

layout = go.Layout(
    title='Top 30 Stadium Capacity (Lowest Seats Capacity)',
    xaxis=dict(title='Club'),
    yaxis=dict(title='Number of Seats'),
    margin=dict(b=150),  # Adjust bottom margin to accommodate x-axis labels
    hovermode='closest',
)

fig = go.Figure(data=data, layout=layout)
fig.update_xaxes(tickangle=45, tickmode='array', tickvals=top_stadiums.index, ticktext=top_stadiums.index)

fig.show()
In [5]:
# data analysis of the club with the highest national team players
In [6]:
# Sort by number of national team players and select top 10
top_clubs = clubs_df.sort_values(by='national_team_players', ascending=False).head(20)

# Create Plotly bar chart
data = [
    go.Bar(
        x=top_clubs['name'],
        y=top_clubs['national_team_players'],
        marker=dict(color='blue')
    )
]

layout = go.Layout(
    title='Top 20 Clubs Worldwide with Most National Team Players',
    xaxis=dict(title='Club'),
    yaxis=dict(title='Number of National Team Players'),
    margin=dict(b=150),  # Adjust bottom margin to accommodate x-axis labels
    hovermode='closest',
)

fig = go.Figure(data=data, layout=layout)

# Update x-axis tick angle for better readability
fig.update_xaxes(tickangle=45)

# Show plot
fig.show()

Data analysis of the largest player club¶

In [7]:
largest_club = clubs_df.sort_values(by='squad_size', ascending=False).head(5)


data = [
    go.Bar(
        x=largest_club['name'],
        y=largest_club['squad_size'],
        marker=dict(color='green')
    )
]
layout = go.Layout(
    title='Top 5 Clubs with Largest Squad Size',
    xaxis=dict(title='Club'),
    yaxis=dict(title='Squad Size'),
    margin=dict(b=150),  # Adjust bottom margin to accommodate x-axis labels
    hovermode='closest',
)
fig = go.Figure(data=data, layout=layout)
fig.update_xaxes(tickangle=45)
fig.show()

Analysis of the clubs with the highest win rate and goal rate¶

In [8]:
import pandas as pd
import plotly.graph_objs as go

# Supponiamo che club_games_df e clubs_df siano già stati definiti e contengano i dati necessari

# Calcola il numero di vittorie per ciascun club
club_games_df['win'] = club_games_df['is_win'] == 1
wins_per_club = club_games_df.groupby('club_id')['win'].sum().reset_index()
wins_per_club.columns = ['club_id', 'num_wins']

# Calcola la media dei goal segnati per ciascun club
average_goals_per_club = club_games_df.groupby('club_id')['own_goals'].mean().reset_index()
average_goals_per_club.columns = ['club_id', 'average_goals']

# Unisci i dati delle vittorie e della media dei goal
club_stats = pd.merge(wins_per_club, average_goals_per_club, on='club_id')
club_stats = pd.merge(club_stats, clubs_df[['club_id', 'name']], on='club_id')

# Trova i top 5 club con il maggior numero di vittorie
top_5_clubs = club_stats.sort_values(by='num_wins', ascending=False).head(5)

# Crea un grafico a barre interattivo con Plotly
data = [
    go.Bar(
        x=top_5_clubs['name'],
        y=top_5_clubs['num_wins'],
        name='Number of Wins',
        marker=dict(color='blue')
    ),
    go.Bar(
        x=top_5_clubs['name'],
        y=top_5_clubs['average_goals'],
        name='Average Goals',
        marker=dict(color='green')
    )
]

layout = go.Layout(
    title='Top 5 Clubs by Number of Wins and Average Goals',
    xaxis=dict(title='Club'),
    yaxis=dict(title='Value'),
    barmode='group'
)

fig = go.Figure(data=data, layout=layout)

# Mostra il grafico

fig.show()

Analysis of the club with the prizes player and the club with the most wins¶

In [9]:
import pandas as pd
import plotly.graph_objs as go
import plotly.subplots as sp

# Assumiamo che tu abbia già i dataframe players_df, clubs_df, appearances_df e matches_df

# Trovare il giocatore più pagato
max_market_value_player = players_df.loc[players_df['market_value_in_eur'].idxmax()]
max_market_value_team_id = max_market_value_player['current_club_id']

# Calcolare il numero totale di goal per ciascun team
total_goals_df = appearances_df.groupby('player_club_id')['goals'].sum().reset_index()
max_goals_team_id = total_goals_df.loc[total_goals_df['goals'].idxmax()]['player_club_id']

# Estrazione dei dati per i team interessati
max_market_value_team = clubs_df[clubs_df['club_id'] == max_market_value_team_id]
max_goals_team = clubs_df[clubs_df['club_id'] == max_goals_team_id]

# Funzione per estrarre dati per il grafico
def extract_team_data(team_df, team_id):
    # Calcolare le vittorie per il team
    team_wins =club_games_df[(club_games_df['club_id'] == team_id) & (club_games_df['is_win'] == 1)].shape[0]

    return {
        'squad_size': team_df['squad_size'].values[0],
        'average_age': team_df['average_age'].values[0],
        'yellow_cards': appearances_df[appearances_df['player_club_id'] == team_id]['yellow_cards'].sum(),
        'red_cards': appearances_df[appearances_df['player_club_id'] == team_id]['red_cards'].sum(),
        'wins': team_wins,
        'losses': 0  # Supponiamo che non abbiamo informazioni sulle sconfitte
    }

# Dati per i team più pagato e con più goal
max_market_value_team_data = extract_team_data(max_market_value_team, max_market_value_team_id)
max_goals_team_data = extract_team_data(max_goals_team, max_goals_team_id)

# Nome dei club
max_market_value_team_name = max_market_value_team['name'].values[0]
max_goals_team_name = max_goals_team['name'].values[0]

# Creare il grafico interattivo
fig = sp.make_subplots(
    rows=1, 
    cols=2, 
    subplot_titles=(f"Team del Giocatore più Pagato ({max_market_value_team_name})", f"Team con più Goal ({max_goals_team_name})")
)

# Aggiungere dati al grafico
fig.add_trace(go.Bar(x=list(max_market_value_team_data.keys()), y=list(max_market_value_team_data.values()), name=max_market_value_team_name), row=1, col=1)
fig.add_trace(go.Bar(x=list(max_goals_team_data.keys()), y=list(max_goals_team_data.values()), name=max_goals_team_name), row=1, col=2)

# Aggiornare il layout del grafico
fig.update_layout(title_text="Confronto tra Team", showlegend=False)

# Visualizzare il grafico
fig.show()

data analysis of the club with the highest net transfer record for each competition with the most wins club for each competition¶

In [10]:
# 1. Calcoliamo i cartellini rossi e gialli per ogni squadra
red_cards = appearances_df.groupby('player_current_club_id')['red_cards'].sum()
yellow_cards = appearances_df.groupby('player_current_club_id')['yellow_cards'].sum()

# 2. Calcoliamo il numero di vittorie per ogni squadra
wins = club_games_df.groupby('club_id')['is_win'].sum()

# 3. Otteniamo la dimensione della squadra (squad_size)
squad_size = clubs_df.set_index('club_id')['squad_size']

# Uniamo tutte le metriche in un unico DataFrame per semplificare il plotting
data = pd.DataFrame({
    'Red Cards': red_cards,
    'Yellow Cards': yellow_cards,
    'Wins': wins,
    'Squad Size': squad_size
}).fillna(0)  # Gestione dei NaN nel caso ci siano dati mancanti

# Uniamo anche il nome delle squadre per renderlo più leggibile nei grafici
data = data.join(clubs_df.set_index('club_id')['name'])

# Troviamo la squadra con più trasferimenti
team_with_most_transfers = data['Squad Size'].idxmax()

# Troviamo la squadra più vincente
team_with_most_wins = data['Wins'].idxmax()
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Filtra i dati per le due squadre principali di ogni campionato
team1_data = data[data.index == team_with_most_transfers]
team2_data = data[data.index == team_with_most_wins]

# Crea i subplot
fig = make_subplots(rows=2, cols=2, subplot_titles=("Red Cards", "Yellow Cards", "Wins", "Squad Size"))

# Aggiungi i grafici per ciascuna metrica
fig.add_trace(go.Bar(x=team1_data['name'], y=team1_data['Red Cards'], name='Most Transfers'), row=1, col=1)
fig.add_trace(go.Bar(x=team2_data['name'], y=team2_data['Red Cards'], name='Most Wins'), row=1, col=1)

fig.add_trace(go.Bar(x=team1_data['name'], y=team1_data['Yellow Cards'], name='Most Transfers'), row=1, col=2)
fig.add_trace(go.Bar(x=team2_data['name'], y=team2_data['Yellow Cards'], name='Most Wins'), row=1, col=2)

fig.add_trace(go.Bar(x=team1_data['name'], y=team1_data['Wins'], name='Most Transfers'), row=2, col=1)
fig.add_trace(go.Bar(x=team2_data['name'], y=team2_data['Wins'], name='Most Wins'), row=2, col=1)

fig.add_trace(go.Bar(x=team1_data['name'], y=team1_data['Squad Size'], name='Most Transfers'), row=2, col=2)
fig.add_trace(go.Bar(x=team2_data['name'], y=team2_data['Squad Size'], name='Most Wins'), row=2, col=2)

# Aggiungi titoli e nomi degli assi
fig.update_layout(title='Confronto tra Squadra con più trasferimenti e più vincente',
                  xaxis_title='Squadra',
                  yaxis_title='Valore',
                  barmode='group')

# Mostra il grafico
fig.show()
In [11]:
# Fusione dei dati delle partite con i dati dei club per ottenere i nomi dei club
merge_club_games_df = pd.merge(club_games_df, clubs_df, on='club_id')

# Effettua il merge con il DataFrame delle apparizioni basato su game_id o altre chiavi di collegamento appropriate
merge_club_games_appearances_df = pd.merge(merge_club_games_df, appearances_df[['game_id', 'yellow_cards', 'red_cards']], on='game_id', how='left')

# Conteggio delle vittorie per ciascun club in ogni competizione
wins_count_per_club = merge_club_games_appearances_df[merge_club_games_appearances_df['is_win'] == 1].groupby(['domestic_competition_id', 'club_id']).size().reset_index(name='wins_count')

# Trova il club con il massimo numero di vittorie per ogni competizione
clubs_with_most_wins_per_competition = wins_count_per_club.loc[wins_count_per_club.groupby('domestic_competition_id')['wins_count'].idxmax()]

# Inizializza un elenco di figure per ogni competizione
figures = []

# Ciclo su ogni competizione e crea una figura separata per ciascuna
for comp_id in clubs_with_most_wins_per_competition['domestic_competition_id']:
    # Trova il nome della squadra con più trasferimenti e più vittorie per questa competizione
    team_most_transfers = clubs_df[clubs_df['domestic_competition_id'] == comp_id]['club_id'].iloc[0]
    team_most_wins = clubs_with_most_wins_per_competition[clubs_with_most_wins_per_competition['domestic_competition_id'] == comp_id]['club_id'].iloc[0]
    
    # Filtra i dati per le due squadre di interesse
    team1_data = merge_club_games_appearances_df[(merge_club_games_appearances_df['club_id'] == team_most_transfers) & (merge_club_games_appearances_df['domestic_competition_id'] == comp_id)].iloc[0]
    team2_data = merge_club_games_appearances_df[(merge_club_games_appearances_df['club_id'] == team_most_wins) & (merge_club_games_appearances_df['domestic_competition_id'] == comp_id)].iloc[0]
    
    # Crea una nuova figura per la competizione corrente
    fig = make_subplots(rows=1, cols=5, subplot_titles=[f"Competizione {comp_id} - {clubs_df.loc[clubs_df['club_id'] == team_most_transfers, 'name'].iloc[0]} vs {clubs_df.loc[clubs_df['club_id'] == team_most_wins, 'name'].iloc[0]}"])
    
    # Aggiungi i grafici per ogni metrica sulla competizione corrente
    fig.add_trace(go.Bar(x=['Most Transfers', 'Most Wins'],
                         y=[team1_data['yellow_cards'], team2_data['yellow_cards']],
                         name=f'{clubs_df.loc[clubs_df["club_id"] == team_most_transfers, "name"].iloc[0]} - Yellow Cards', marker_color='gold', legendgroup='team1'), row=1, col=1)
    
    fig.add_trace(go.Bar(x=['Most Transfers', 'Most Wins'],
                         y=[team1_data['red_cards'], team2_data['red_cards']],
                         name=f'{clubs_df.loc[clubs_df["club_id"] == team_most_transfers, "name"].iloc[0]} - Red Cards', marker_color='indianred', legendgroup='team1'), row=1, col=2)
    
    fig.add_trace(go.Bar(x=['Most Transfers', 'Most Wins'],
                         y=[team1_data['is_win'], team2_data['is_win']],
                         name=f'{clubs_df.loc[clubs_df["club_id"] == team_most_transfers, "name"].iloc[0]} - Wins', marker_color='mediumseagreen', legendgroup='team1'), row=1, col=3)
    
    fig.add_trace(go.Bar(x=['Most Transfers', 'Most Wins'],
                         y=[team1_data['total_market_value'], team2_data['total_market_value']],
                         name=f'{clubs_df.loc[clubs_df["club_id"] == team_most_transfers, "name"].iloc[0]} - Total Market Value', marker_color='royalblue', legendgroup='team1'), row=1, col=4)
    
    fig.add_trace(go.Bar(x=['Most Transfers', 'Most Wins'],
                         y=[team1_data['squad_size'], team2_data['squad_size']],
                         name=f'{clubs_df.loc[clubs_df["club_id"] == team_most_transfers, "name"].iloc[0]} - Squad Size', marker_color='orange', legendgroup='team1'), row=1, col=5)
    
    # Aggiungi i grafici per il secondo team con la stessa metrica
    fig.add_trace(go.Bar(x=['Most Transfers', 'Most Wins'],
                         y=[team2_data['yellow_cards'], team1_data['yellow_cards']],
                         name=f'{clubs_df.loc[clubs_df["club_id"] == team_most_wins, "name"].iloc[0]} - Yellow Cards', marker_color='gold', legendgroup='team2'), row=1, col=1)
    
    fig.add_trace(go.Bar(x=['Most Transfers', 'Most Wins'],
                         y=[team2_data['red_cards'], team1_data['red_cards']],
                         name=f'{clubs_df.loc[clubs_df["club_id"] == team_most_wins, "name"].iloc[0]} - Red Cards', marker_color='indianred', legendgroup='team2'), row=1, col=2)
    
    fig.add_trace(go.Bar(x=['Most Transfers', 'Most Wins'],
                         y=[team2_data['is_win'], team1_data['is_win']],
                         name=f'{clubs_df.loc[clubs_df["club_id"] == team_most_wins, "name"].iloc[0]} - Wins', marker_color='mediumseagreen', legendgroup='team2'), row=1, col=3)
    
    fig.add_trace(go.Bar(x=['Most Transfers', 'Most Wins'],
                         y=[team2_data['total_market_value'], team1_data['total_market_value']],
                         name=f'{clubs_df.loc[clubs_df["club_id"] == team_most_wins, "name"].iloc[0]} - Total Market Value', marker_color='royalblue', legendgroup='team2'), row=1, col=4)
    
    fig.add_trace(go.Bar(x=['Most Transfers', 'Most Wins'],
                         y=[team2_data['squad_size'], team1_data['squad_size']],
                         name=f'{clubs_df.loc[clubs_df["club_id"] == team_most_wins, "name"].iloc[0]} - Squad Size', marker_color='orange', legendgroup='team2'), row=1, col=5)
    
    # Aggiungi la figura alla lista delle figure
    figures.append(fig)

# Aggiungi i layout e mostra ogni figura separatamente
for fig in figures:
    fig.update_layout(title='Confronto tra Squadra con più Trasferimenti e Squadra con più Vittorie',
                      showlegend=True, height=400, width=1000, barmode='group')
    fig.update_xaxes(title_text="Metrica", row=1, col=3)
    fig.update_yaxes(title_text="Valore", row=1, col=1)
    fig.show()